#!/usr/bin/env python
from collections import defaultdict
import math

import numpy as np
import scipy.io.wavfile
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv1D, MaxPooling1D, LSTM
from keras.layers import TimeDistributed, BatchNormalization, Activation

# This script trains a recurrent convolutional network.
# After every epoch the weights are saved to rec_epoch_<#>_weights.hd5.

######## EDITABLE PARAMETERS #######
# POINT TO TRAINING CSV
train_csv = 'trainsongs.csv'
# POINT TO FOLDER WITH WAV FILES
wav_directory = 'wavsongs/'
# NUMBER OF SAMPLES IN THE WAV FILES
nrsamples = 465984
# HOW LARGE THE SEGMENTATION WINDOW SHOULD BE
segmentsize = 59049
# CONTROLS THE WINDOW STRIDE DURING SEGMENTATION
stride = segmentsize
# HOW MANY EPOCHS TO TRAIN FOR
epochs = 50
# FILTERS FOR CONVOLUTIONAL LAYERS; ENSURE THERE ARE log(segmentsize)/log(3) FILTER SIZES
# (59049 = 3^10, so ten filter sizes are needed here)
convFilters = [64, 64, 128, 128, 128, 128, 128, 128, 256, 256]
# NODES IN LSTM LAYER
LSTMnodes = 512
# LOWER THESE WHEN THERE IS NOT ENOUGH MEMORY AVAILABLE
maxinmem = 800
batchSize = 8
####################################


def get_model_memory_usage(batch_size, model):
    # Rough estimate of the memory (in GiB) needed for one batch, assuming
    # 4-byte float32 activations and parameters.
    shapes_mem_count = 0
    for l in model.layers:
        single_layer_mem = 1
        for s in l.output_shape:
            if s is None:
                continue
            single_layer_mem *= s
        shapes_mem_count += single_layer_mem
    trainable_count = np.sum([K.count_params(p) for p in set(model.trainable_weights)])
    non_trainable_count = np.sum([K.count_params(p) for p in set(model.non_trainable_weights)])
    total_memory = 4.0 * batch_size * (shapes_mem_count + trainable_count + non_trainable_count)
    gbytes = np.round(total_memory / (1024.0 ** 3), 3)
    return gbytes


class Clip:
    def __init__(self, songinfo):
        self.artist = songinfo[0]
        self.title = songinfo[1]
        self.album = songinfo[2]
        self.path = songinfo[3]

    def asString(self):
        return self.artist + ' - ' + self.title + ', ' + self.album + ', path: ' + self.path


def GetTrainingData():
    # Read the tab-separated training CSV; assign each artist an index the
    # first time it is seen, and wrap every line in a Clip object.
    songsPerArtist = defaultdict(int)
    clips = []
    artistIndex = dict()
    with open(train_csv) as train:
        for line in train:
            songinfo = line.strip('\n').split('\t')
            songsPerArtist[songinfo[0]] += 1
            if songsPerArtist[songinfo[0]] == 1:
                artistIndex[songinfo[0]] = len(artistIndex)
            clips.append(Clip(songinfo))
    return artistIndex, clips


def GetSegmentedWav(clip, segmentsize, nrsamples, stride):
    # Cut one wav file into windows (back-to-back when stride == segmentsize)
    # and add a trailing channel axis for Conv1D.
    (rate, wavdata) = scipy.io.wavfile.read(wav_directory + clip.path)
    segments = int(math.ceil((nrsamples - segmentsize) / float(stride)))
    # float32 matches what Keras uses and keeps the memory estimate consistent
    segwav = np.zeros((segments, segmentsize), dtype='float32')
    for i in range(0, segments):
        segwav[i] = wavdata[i * stride:i * stride + segmentsize]
    return segwav.reshape(segments, segmentsize, 1)


def GetSlices(clips, maxperslice):
    # Split the clip indices into chunks of at most maxperslice clips, so that
    # only one chunk of wav data has to be held in memory at a time.
    slices = [(i * maxperslice, (i + 1) * maxperslice) for i in range(0, clips // maxperslice)]
    # Add the remainder as a final, shorter slice (if any clips are left over)
    if clips % maxperslice != 0:
        slices.append((clips - clips % maxperslice, clips))
    return slices
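
# A minimal sanity check for the segmentation helpers above (an illustrative
# sketch; CheckSegmentation is a hypothetical name, not part of the original
# pipeline): it verifies that the slices cover every clip exactly once and
# that the last segmentation window stays inside the sample range.
def CheckSegmentation(nrclips, maxperslice, nrsamples, segmentsize, stride):
    slices = GetSlices(nrclips, maxperslice)
    covered = sum(end - start for (start, end) in slices)
    assert covered == nrclips, 'slices must cover every clip exactly once'
    segments = int(math.ceil((nrsamples - segmentsize) / float(stride)))
    last_end = (segments - 1) * stride + segmentsize
    assert last_end <= nrsamples, 'last window must stay inside the clip'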

def CreateModel(segmentsize, segments, classes):
    # Sample-level convolutional front end: the first strided convolution and
    # the nine pool-by-3 blocks each shrink the time axis by a factor of 3,
    # collapsing a 3^10-sample window into a single feature frame per segment.
    model = Sequential()
    model.add(TimeDistributed(Conv1D(convFilters[0], 3, strides=3), input_shape=(segments, segmentsize, 1)))
    model.add(TimeDistributed(BatchNormalization()))
    model.add(TimeDistributed(Activation('relu')))
    for i in range(1, len(convFilters)):
        model.add(TimeDistributed(Conv1D(convFilters[i], 3, padding='same')))
        model.add(TimeDistributed(BatchNormalization()))
        model.add(TimeDistributed(Activation('relu')))
        model.add(TimeDistributed(MaxPooling1D(3)))
    # After the final convolutional layer, flatten the output for the recurrent layer
    model.add(TimeDistributed(Flatten()))
    # Add dropout to combat overfitting
    model.add(TimeDistributed(Dropout(0.5)))
    # The LSTM summarizes the sequence of segments into one vector,
    # which is then classified over the artists
    model.add(LSTM(LSTMnodes))
    model.add(Dense(classes))
    model.add(BatchNormalization())
    model.add(Activation('softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    return model


segments = int(math.ceil((nrsamples - segmentsize) / float(stride)))
epochstart = 0
(artistIndex, clips) = GetTrainingData()
nrclips = len(clips)
nrartists = len(artistIndex)

model = CreateModel(segmentsize, segments, nrartists)
model.summary()

# Report the estimated memory footprint before training starts
mem = get_model_memory_usage(batchSize, model)
meminput = segments * segmentsize * maxinmem * 4 / (1024.0 ** 3)
print("Mem use model: {}".format(mem))
print("Mem use input: {}".format(meminput))
print("Total is: {}".format(mem + meminput))

slices = GetSlices(nrclips, maxinmem)
for epoch in range(epochstart, epochs):
    print('epoch: {}'.format(epoch))
    filename = 'rec_epoch_' + str(epoch) + '_weights.hd5'
    # Shuffle the clip order every epoch
    order = np.random.permutation(nrclips)
    c = 0
    tot = 0
    for sli in slices:
        # Load one slice of segmented wav data and build its one-hot targets
        batch = np.array([GetSegmentedWav(clips[i], segmentsize, nrsamples, stride) for i in order[sli[0]:sli[1]]])
        target = np.zeros((sli[1] - sli[0], nrartists))
        for i in range(0, sli[1] - sli[0]):
            target[i][artistIndex[clips[order[sli[0] + i]].artist]] = 1
        c += 1
        print('{}: Training clips {} - {}'.format(c, tot, tot + sli[1] - sli[0] - 1))
        tot += sli[1] - sli[0]
        model.fit(batch, target, batch_size=batchSize, epochs=1, verbose=0)
        # Drop the reference so the slice can be garbage-collected before the next one is loaded
        batch = None
    model.save_weights(filename)
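
# After training, a saved checkpoint can be restored for inference. A minimal
# sketch, assuming the final epoch finished; evalclips stands in for a list of
# Clip objects prepared like the training data and is hypothetical:
#
#   model.load_weights('rec_epoch_{}_weights.hd5'.format(epochs - 1))
#   batch = np.array([GetSegmentedWav(c, segmentsize, nrsamples, stride) for c in evalclips])
#   probabilities = model.predict(batch, batch_size=batchSize)
#   predictedArtists = probabilities.argmax(axis=1)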